library(dplyr)
library(ggplot2)
library(DESeq2)
## Warning: package 'GenomeInfoDb' was built under R version 4.1.1
## Warning: package 'MatrixGenerics' was built under R version 4.1.1
library(tximport)
library(readr)
library(tximportData)
library(readxl)
library(knitr)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(viridis)
library(ggrepel)
library(EnhancedVolcano)
library(fgsea)
Plasma 1:10
Viridis 1:10
Cividis 1:10
Magma 1:10
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.
##This section of code will generate a summary table of the samples sequenced and their sequencing and alignment metrics
| Sample | patient | reads | %Q30 | Duplication rate | % reads with adapter | STAR alignment number | percent aligned | splices annotated | salmon mapping |
|---|---|---|---|---|---|---|---|---|---|
| Bulk | HTB314 | 71816976 | 94.3 | 57 | 2.6 | 63614504 | 88.58 | 27340654 | 85.6967 |
| Bulk | MDS268 | 31441538 | 93.0 | 24 | 2.3 | 25549835 | 81.26 | 5546270 | 58.2075 |
| Bulk | MDS280 | 28794421 | 93.0 | 24 | 2.3 | 23317205 | 80.98 | 5115546 | 58.8614 |
| CD123+ | HTB314 | 61568500 | 94.0 | 56 | 2.7 | 55359277 | 89.91 | 22831187 | 86.4197 |
| CD123+ | MDS268 | 32307881 | 93.0 | 27 | 3.3 | 26243441 | 81.23 | 7290861 | 64.2150 |
| CD123+ | MDS280 | 23745205 | 93.0 | 34 | 2.6 | 21035358 | 88.59 | 5668653 | 71.3245 |
| CD123- | HTB314 | 83260626 | 94.0 | 58 | 2.6 | 74804722 | 89.84 | 31876605 | 87.6505 |
| CD123- | MDS268 | 27917999 | 93.0 | 26 | 2.7 | 22835189 | 81.79 | 5793322 | 63.9284 |
| CD123- | MDS280 | 24767573 | 93.0 | 32 | 2.6 | 22467937 | 90.72 | 4773823 | 62.7193 |
##This section of code will import and format the data for DeSeq2
#to generate a vector of names and file locations
files<-file.path("~/Desktop/Jordan files/Counts/salmon/salmon/", list.files("~/Desktop/Jordan files/Counts/salmon/salmon/"), "quant.sf")
names(files)<-list.files("~/Desktop/Jordan files/Counts/salmon/salmon")
#to call in a gene_map this was derived by pulling it out of the fasta file with a grep command for ENST and ENSG and pasting them together
gene_map=read_csv("~/Desktop/Jordan files/Counts/salmon/gene_map.csv", col_names=c('enstid', 'ensgid'))
# import transcript level counts
txi.salmon.t<-tximport(files, type="salmon", txOut=TRUE)
txi.salmon.g<-tximport(files=files, type="salmon", tx2gene= gene_map, ignoreTxVersion = TRUE, countsFromAbundance = 'lengthScaledTPM' )
### this code works but if I remove the ignoreTxVersion I get an error, this may have to do with how I am generating my tx2gene file
# Extract counts only
counts <- txi.salmon.g$counts %>%
as.data.frame()
#Extract TPM
tpms <- data.frame(txi.salmon.g$abundance)
##for clients the counts and tpm files should be written out
##This chunk of code sets an expression threshold for TMP where all samples have at least 5 counts
expressed_genes<-tpms %>% filter_all(all_vars(. > 5))
##This chunk of code generates a heatmap using the spearman method as well as a correlation heatmap ## PCA plot
exp.pca <- prcomp(t(log2(expressed_genes)))
exp.pca.summary <- summary(exp.pca)$importance
pc1var = round(exp.pca.summary[3,1] * 100, 1)
pc2var = round(exp.pca.summary[3,2] * 100 - pc1var, 1)
exp.pca.pc <- data.frame(exp.pca$x, sample = colnames(expressed_genes))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## SampleName = col_character(),
## FileName = col_character(),
## patient = col_character(),
## cellType = col_character()
## )
## 3. Run Differential Expression testing using DESeq2 and Calculate Gene Set Enrichment ## Compare 123pos vs 123neg, 123neg vs bulk, and 123pos vs bulk #### sig = padj <0.01 and abs(l2fc) >0.5 ####
coldata<-dplyr::select(metadata, -FileName)
coldata<-column_to_rownames(coldata, 'SampleName')
## need to confirm that all names are in the same order
all(rownames(coldata) %in% colnames(txi.salmon.g$counts))
## [1] TRUE
all(rownames(coldata) == colnames(txi.salmon.g$counts))
## [1] FALSE
coldata<-coldata[colnames(txi.salmon.g$counts),]
all(rownames(coldata) == colnames(txi.salmon.g$counts))
## [1] TRUE
dds <- DESeqDataSetFromTximport(txi.salmon.g,
colData = coldata,
design = ~ patient + cellType)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## using just counts from tximport
### unfiltered data
dds.unfiltered <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res.unfiltered <- results( dds.unfiltered)
##filtered data changed this from dds[rowMins(counts(dds))>5,] to a less stringent filtering
keep<-rowSums(counts(dds))>=10
#dds.filtered<-dds[rowMins(counts(dds))>5,]
dds.filtered<-dds[keep,]
dds.filtered<-DESeq(dds.filtered)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res.filtered<-results(dds.filtered)
###outcome the first filtering cut from 60000 genese to 14000 genes with the new filter 35000genes are left which is about 1/2
### need to subset to do pairwise comparison unclear if this keeps the patient design aspect
res_123posvs123neg_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123pos", "123neg"))
res_123posvsBulk_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123pos", "bulk"))
res_123negvsBulk_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123neg", "bulk"))
res_123posvs123neg_filtered<-results(dds.filtered, contrast=c("cellType", "123pos", "123neg"))
res_123posvsBulk_filtered<-results(dds.filtered, contrast=c("cellType", "123pos", "bulk"))
res_123negvsBulk_filtered<-results(dds.filtered, contrast=c("cellType", "123neg", "bulk"))
## look at the numbers of genes meeting threshold the log fold change call is not changing things
sum( res_123posvs123neg_unfiltered$pvalue < 0.01 & abs(res_123posvs123neg_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 1405
sum( res_123posvsBulk_unfiltered$pvalue < 0.01 & abs(res_123posvsBulk_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2382
sum(res_123negvsBulk_unfiltered$pvalue < 0.01 & abs(res_123negvsBulk_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2336
sum( res_123posvs123neg_filtered$pvalue < 0.01 & abs(res_123posvs123neg_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 1206
sum( res_123posvsBulk_filtered$pvalue < 0.01 & abs(res_123posvsBulk_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2157
sum(res_123negvsBulk_filtered$pvalue < 0.01 & abs(res_123negvsBulk_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2130
sum(res_123posvs123neg_filtered$padj < 0.1, na.rm=TRUE)
## [1] 1000
sum(res_123posvs123neg_unfiltered$padj < 0.1, na.rm=TRUE)
## [1] 1180
## Warning: Removed 19307 rows containing missing values (geom_point).
## Warning: Removed 20003 rows containing missing values (geom_point).
## Warning: Removed 20003 rows containing missing values (geom_point).
###MA plots
pca_data<-vst(dds, blind=T)
ntop=500
rv <- rowVars(assay(pca_data))
select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))]
pca <- prcomp(t(assay(pca_data)[select,]))
percentVar2 <- data.frame(percentVar=pca$sdev^2/sum(pca$sdev^2))%>%
mutate(pc=1:n())%>%dplyr::select(pc, percentVar)
pca_df<-pca$x%>%data.frame()%>%mutate(type=metadata$cellType)
alt_col_values=c("#88CCEE", "#CC6677", "#DDCC77")
pca_data2<-vst(dds.filtered, blind=T)
ntop=500
rv <- rowVars(assay(pca_data2))
select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))]
pca2 <- prcomp(t(assay(pca_data2)[select,]))
percentVar3 <- data.frame(percentVar=pca2$sdev^2/sum(pca2$sdev^2))%>%
mutate(pc=1:n())%>%dplyr::select(pc, percentVar)
pca_df2<-pca2$x%>%data.frame()%>%mutate(type=metadata$cellType)
alt_col_values=c("#88CCEE", "#CC6677", "#DDCC77")
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
###GSEA analysis
##generate gene names to go with ensembl gene id as the pathways will use gene name
mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host='www.ensembl.org')
t2g_hs <- biomaRt::getBM(attributes = c('ensembl_transcript_id', 'ensembl_gene_id', 'external_gene_name', 'refseq_mrna'), mart = mart)
ens2gene <- t2g_hs[,c(2,3)]
colnames(ens2gene)[2] <- 'Gene'
ens2gene <- unique(ens2gene)
##loading hallmark pathways and KEGG pathways
pathways.hallmark <- gmtPathways("~/Desktop/Jordan files/h.all.v7.4.symbols.gmt")
pathways.kegg<-gmtPathways("~/Desktop/Jordan files/c2.cp.kegg.v7.4.symbols.gmt")
##generating tidy data and adding it to the results file
##123pos vs 123neg analysis
res_posneg_gsea<-results(dds.filtered, contrast=c("cellType", "123pos", "123neg"),tidy=TRUE)
colnames(res_posneg_gsea)[1]<-"ensembl_gene_id"
res_posneg_gsea <- inner_join(res_posneg_gsea,ens2gene,by="ensembl_gene_id")
res_posneg_gsea2<-res_posneg_gsea%>%
dplyr::select(Gene, stat) %>%
na.omit() %>%
distinct() %>%
group_by(Gene) %>%
summarise(stat=mean(stat))
rank_posneg<-deframe(res_posneg_gsea2)
fgseaRes_posneg <- fgsea(pathways=pathways.hallmark, stats=rank_posneg)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.06% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_posneg_kegg<-fgsea(pathways=pathways.kegg, stats=rank_posneg)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.06% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_posneg_tidy <-fgseaRes_posneg%>%
as_tibble() %>%
arrange(desc(NES))
fgseaRes_posneg_kegg_tidy <-fgseaRes_posneg_kegg%>%
as_tibble() %>%
arrange(desc(NES))
##123 pos vs bulk analysis
res_posbulk_gsea<-results(dds.filtered, contrast=c("cellType", "123pos", "bulk"),tidy=TRUE)
colnames(res_posbulk_gsea)[1]<-"ensembl_gene_id"
res_posbulk_gsea <- inner_join(res_posbulk_gsea,ens2gene,by="ensembl_gene_id")
res_posbulk_gsea2<-res_posbulk_gsea%>%
dplyr::select(Gene, stat) %>%
na.omit() %>%
distinct() %>%
group_by(Gene) %>%
summarise(stat=mean(stat))
rank_posbulk<-deframe(res_posbulk_gsea2)
fgseaRes_posbulk<- fgsea(pathways=pathways.hallmark, stats=rank_posbulk)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.26% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_posbulk_kegg<-fgsea(pathways=pathways.kegg, stats=rank_posbulk)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.26% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_posbulk_tidy <-fgseaRes_posbulk%>%
as_tibble() %>%
arrange(desc(NES))
fgseaRes_posbulk_kegg_tidy <-fgseaRes_posbulk_kegg%>%
as_tibble() %>%
arrange(desc(NES))
##123 neg vs bulk analysis
res_negbulk_gsea<-results(dds.filtered, contrast=c("cellType", "123neg", "bulk"),tidy=TRUE)
colnames(res_negbulk_gsea)[1]<-"ensembl_gene_id"
res_negbulk_gsea <- inner_join(res_negbulk_gsea,ens2gene,by="ensembl_gene_id")
res_negbulk_gsea2<-res_negbulk_gsea%>%
dplyr::select(Gene, stat) %>%
na.omit() %>%
distinct() %>%
group_by(Gene) %>%
summarise(stat=mean(stat))
rank_negbulk<-deframe(res_negbulk_gsea2)
fgseaRes_negbulk<- fgsea(pathways=pathways.hallmark, stats=rank_negbulk)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.4% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_negbulk_kegg<-fgsea(pathways=pathways.kegg, stats=rank_negbulk)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.4% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_negbulk_tidy <-fgseaRes_negbulk%>%
as_tibble() %>%
arrange(desc(NES))
fgseaRes_negbulk_kegg_tidy <-fgseaRes_negbulk_kegg%>%
as_tibble() %>%
arrange(desc(NES))
fgseaRes_posneg_tidy%>%
dplyr::select(-leadingEdge, -ES) %>%
arrange(padj) %>%
DT::datatable(caption = 'Table 1: Hallmark genes with 123pos vs 123neg.')
fgseaRes_posneg_kegg_tidy%>%
dplyr::select(-leadingEdge, -ES) %>%
arrange(padj) %>%
DT::datatable(caption = 'Table 1: KEGG pathways with 123pos vs 123neg.')
fgseaRes_posbulk_tidy%>%
dplyr::select(-leadingEdge, -ES) %>%
arrange(padj) %>%
DT::datatable(caption = 'Table 1: Hallmark genes with 123pos vs bulk.')
fgseaRes_posbulk_kegg_tidy%>%
dplyr::select(-leadingEdge, -ES) %>%
arrange(padj) %>%
DT::datatable(caption = 'Table 1: KEGG pathways with 123pos vs bulk.')
fgseaRes_negbulk_tidy%>%
dplyr::select(-leadingEdge, -ES) %>%
arrange(padj) %>%
DT::datatable(caption = 'Table 1: Hallmark genes with 123neg vs bulk.')
fgseaRes_posneg_kegg_tidy%>%
dplyr::select(-leadingEdge, -ES) %>%
arrange(padj) %>%
DT::datatable(caption = 'Table 1: KEGG pathways with 123neg vs bulk.')
ggplot(fgseaRes_posneg_tidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.1)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Hallmark pathways NES from GSEA 123pos vs 123neg") +
theme_minimal()
ggplot(fgseaRes_posneg_kegg_tidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.1)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Kegg pathways NES from GSEA 123pos vs 123neg") +
theme_minimal()
ggplot(fgseaRes_posbulk_tidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.1)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Hallmark pathways NES from GSEA 123pos vs bulk") +
theme_minimal()
ggplot(fgseaRes_posbulk_kegg_tidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.1)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Kegg pathways NES from GSEA 123pos vs bulk") +
theme_minimal()
ggplot(fgseaRes_negbulk_tidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.1)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Hallmark pathways NES from GSEA 123neg vs bulk") +
theme_minimal()
ggplot(fgseaRes_negbulk_kegg_tidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.1)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Kegg pathways NES from GSEA 123neg vs bulk ") +
theme_minimal()